Submitted by:
1. Nardine Yousry Kamel Megali 18P6427
--I did the implementation of Naive Bayes as bonus--
import numpy as np
import pandas as pd
import pylab as plt
import scipy.spatial.distance as scidist
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from scipy.optimize import curve_fit
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score,roc_curve,fbeta_score
from sklearn import datasets, linear_model, preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score,make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
Loading the dataset:
# Load the patient dataset from the working directory
data_set = pd.read_csv('data.csv')
# Preview the first 10 records
data_set.head(10)
# (rows, columns) of the raw data
data_set.shape
(863, 15)
So in our data set we have 863 records with 15 features
Preprocessing
#Naming column zero 'Index':
#Naming column zero 'Index':
data_set.columns.values[0]="index"
data_set=data_set.set_index("index")
#Check if i have null values in the data set
# Fixed: `data_set.values.any()` only tests whether any value is truthy;
# a null check must go through isnull().
data_set.isnull().values.any()
True
Correlation Analysis
Correlation is a fundamental property of our variables. It is a general-purpose measurement that can be taken before any modeling. The higher the correlation between the target variable and a predictor variable, the better that predictor tends to perform.
#See data's Correlation
#get correlations of every feature in the dataset
#See data's Correlation
#get correlations of every feature in the dataset
corrmat = data_set.corr()
corr_features = corrmat.index
plt.figure(figsize=(20,20))
#plot heat map with diff colors
# Reuse the correlation matrix computed above instead of recomputing
# data_set[corr_features].corr() (same columns, same result).
seecorr = sns.heatmap(corrmat, annot=True, cmap="twilight")
Splitting that data where the result is in y, and the x has all other columns
# Target y is the "result" column (binary outcome -- presumably
# 0 = recovered, 1 = died; TODO confirm label meaning against the data source).
# x keeps every remaining feature column.
y =data_set["result"]
x= data_set.drop(columns=["result"])
x
| location | country | gender | age | vis_wuhan | from_wuhan | symptom1 | symptom2 | symptom3 | symptom4 | symptom5 | symptom6 | diff_sym_hos | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| index | |||||||||||||
| 0 | 104 | 8 | 1 | 66.0 | 1 | 0 | 14 | 31 | 19 | 12 | 3 | 1 | 8 |
| 1 | 101 | 8 | 0 | 56.0 | 0 | 1 | 14 | 31 | 19 | 12 | 3 | 1 | 0 |
| 2 | 137 | 8 | 1 | 46.0 | 0 | 1 | 14 | 31 | 19 | 12 | 3 | 1 | 13 |
| 3 | 116 | 8 | 0 | 60.0 | 1 | 0 | 14 | 31 | 19 | 12 | 3 | 1 | 0 |
| 4 | 116 | 8 | 1 | 58.0 | 0 | 0 | 14 | 31 | 19 | 12 | 3 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 858 | 48 | 3 | 2 | 24.0 | 0 | 0 | 14 | 31 | 19 | 12 | 3 | 1 | 0 |
| 859 | 0 | 0 | 2 | 35.0 | 0 | 0 | 14 | 31 | 19 | 12 | 3 | 1 | 0 |
| 860 | 3 | 1 | 1 | 49.4 | 0 | 0 | 14 | 31 | 19 | 12 | 3 | 1 | 0 |
| 861 | 24 | 9 | 1 | 49.4 | 0 | 0 | 14 | 31 | 19 | 12 | 3 | 1 | 0 |
| 862 | 15 | 27 | 1 | 70.0 | 0 | 0 | 14 | 31 | 19 | 12 | 3 | 1 | 0 |
863 rows × 13 columns
Visualising the dataset
1. Plotting the data
# Quick scatter of the feature DataFrame against the target.
# NOTE(review): passing a whole DataFrame draws one series per column on a
# shared axis — this is what triggers the FutureWarnings shown below.
plt.plot(x,y,"r.",markersize=2)
plt.grid()
C:\Users\Samir\anaconda3\envs\lab2\lib\site-packages\matplotlib\cbook\__init__.py:1402: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. ndim = x[:, None].ndim C:\Users\Samir\anaconda3\envs\lab2\lib\site-packages\matplotlib\axes\_base.py:278: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. y = y[:, np.newaxis]
2. Age VS Result: since the correlation from the heatmap graph was very high
# Scatter of age against the outcome label
plt.plot(data_set['age'], data_set['result'], 'r.', markersize=4)
plt.title("Age vs. Result")
plt.xlabel('Age')
plt.ylabel("Result")
plt.show()
C:\Users\Samir\anaconda3\envs\lab2\lib\site-packages\matplotlib\axes\_base.py:276: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. x = x[:, np.newaxis]
3. symptom1 VS symptom2: since the correlation from the heatmap graph was very high
# Scatter of symptom1 against symptom2 (visual correlation check)
plt.plot(data_set['symptom1'], data_set['symptom2'], 'r.', markersize=4)
plt.title("Symptom1 vs. Symptom2")
plt.xlabel("Symptom1")
plt.ylabel("Symptom2")
plt.show()
4. symptom2 VS symptom3: since the correlation from the heatmap graph was very high
# Scatter of symptom2 against symptom3 (visual correlation check)
plt.plot(data_set['symptom2'], data_set['symptom3'], 'r.', markersize=4)
plt.title("Symptom2 vers Symptom3")
plt.xlabel("Symptom2")
plt.ylabel("Symptom3")
plt.show()
5. symptom3 VS symptom4: since the correlation from the heatmap graph was very high
# Scatter of symptom3 against symptom4 (visual correlation check)
plt.plot(data_set['symptom3'], data_set['symptom4'], 'r.', markersize=4)
plt.title("Symptom3 vers Symptom4")
plt.xlabel("Symptom3")
plt.ylabel("Symptom4")
plt.show()
Conclusion
From the scatter plots, there is no strong correlation between any of the symptoms.
Therefore, We can't drop any columns!
To have better view about Features's distributions
# Inspect the value distribution of the last four symptom columns to spot
# near-constant features that carry no information.
cols =['symptom3','symptom4','symptom5','symptom6']
for i in range (4):
    value_counts = data_set[cols[i]].value_counts(sort=True)
    counts = pd.DataFrame(value_counts)
    # reset_index turns the unique values into a regular column
    counts_reset = counts.reset_index()
    counts_reset.columns = ['unique_value', 'counts']
    print(cols[i])
    # display() is the notebook builtin for rich DataFrame rendering
    display(counts_reset)
symptom3
| unique_value | counts | |
|---|---|---|
| 0 | 19 | 803 |
| 1 | 8 | 10 |
| 2 | 6 | 7 |
| 3 | 15 | 6 |
| 4 | 5 | 5 |
| 5 | 7 | 5 |
| 6 | 16 | 5 |
| 7 | 14 | 5 |
| 8 | 3 | 4 |
| 9 | 2 | 3 |
| 10 | 13 | 1 |
| 11 | 18 | 1 |
| 12 | 17 | 1 |
| 13 | 0 | 1 |
| 14 | 12 | 1 |
| 15 | 11 | 1 |
| 16 | 1 | 1 |
| 17 | 9 | 1 |
| 18 | 4 | 1 |
| 19 | 10 | 1 |
symptom4
| unique_value | counts | |
|---|---|---|
| 0 | 12 | 844 |
| 1 | 2 | 3 |
| 2 | 0 | 2 |
| 3 | 1 | 2 |
| 4 | 3 | 2 |
| 5 | 7 | 2 |
| 6 | 9 | 2 |
| 7 | 4 | 1 |
| 8 | 5 | 1 |
| 9 | 6 | 1 |
| 10 | 8 | 1 |
| 11 | 10 | 1 |
| 12 | 11 | 1 |
symptom5
| unique_value | counts | |
|---|---|---|
| 0 | 3 | 860 |
| 1 | 0 | 1 |
| 2 | 1 | 1 |
| 3 | 2 | 1 |
symptom6
| unique_value | counts | |
|---|---|---|
| 0 | 1 | 862 |
| 1 | 0 | 1 |
Removing symptom5 & symptom6 since they are almost constant (a single value dominates each column)
# symptom5 and symptom6 are almost constant (one value dominates per the
# counts above) -> virtually no predictive value, so drop them.
data_set=data_set.drop(columns=["symptom5","symptom6"])
data_set
| location | country | gender | age | vis_wuhan | from_wuhan | symptom1 | symptom2 | symptom3 | symptom4 | diff_sym_hos | result | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| index | ||||||||||||
| 0 | 104 | 8 | 1 | 66.0 | 1 | 0 | 14 | 31 | 19 | 12 | 8 | 1 |
| 1 | 101 | 8 | 0 | 56.0 | 0 | 1 | 14 | 31 | 19 | 12 | 0 | 0 |
| 2 | 137 | 8 | 1 | 46.0 | 0 | 1 | 14 | 31 | 19 | 12 | 13 | 0 |
| 3 | 116 | 8 | 0 | 60.0 | 1 | 0 | 14 | 31 | 19 | 12 | 0 | 0 |
| 4 | 116 | 8 | 1 | 58.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 858 | 48 | 3 | 2 | 24.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 1 |
| 859 | 0 | 0 | 2 | 35.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 |
| 860 | 3 | 1 | 1 | 49.4 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 |
| 861 | 24 | 9 | 1 | 49.4 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 |
| 862 | 15 | 27 | 1 | 70.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 |
863 rows × 12 columns
The data used in this project will help to identify whether a person is going to recover from coronavirus symptoms or not based on some pre-defined standard symptoms.
In the first milestone of the project we built KNN, Naive Bayes, and Logistic Regression classifiers to predict whether somebody will recover or die based on the given dataset.
For every classifier we try different preprocessing of the data, and we compare the results to pick the best model from each classifier.
--> We reload the dataset each time to ensure it is the original data, without any normalization or earlier model affecting it.
In this classifier we will use different models as:
One Hot Enconding
Normalisation
Normalisation with One hot Encoding
and see which works better and have best ROC and AUC as well as F2-Score
#one hot encoding
# Expand the categorical columns into binary indicator columns so KNN's
# Euclidean distance does not treat category codes as ordered numbers.
var=pd.get_dummies(x,columns =["gender","location","country"])
x=pd.DataFrame(data=var)
# Show the (unchanged) original dataset for reference
data_set
| location | country | gender | age | vis_wuhan | from_wuhan | symptom1 | symptom2 | symptom3 | symptom4 | diff_sym_hos | result | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| index | ||||||||||||
| 0 | 104 | 8 | 1 | 66.0 | 1 | 0 | 14 | 31 | 19 | 12 | 8 | 1 |
| 1 | 101 | 8 | 0 | 56.0 | 0 | 1 | 14 | 31 | 19 | 12 | 0 | 0 |
| 2 | 137 | 8 | 1 | 46.0 | 0 | 1 | 14 | 31 | 19 | 12 | 13 | 0 |
| 3 | 116 | 8 | 0 | 60.0 | 1 | 0 | 14 | 31 | 19 | 12 | 0 | 0 |
| 4 | 116 | 8 | 1 | 58.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 858 | 48 | 3 | 2 | 24.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 1 |
| 859 | 0 | 0 | 2 | 35.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 |
| 860 | 3 | 1 | 1 | 49.4 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 |
| 861 | 24 | 9 | 1 | 49.4 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 |
| 862 | 15 | 27 | 1 | 70.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 |
863 rows × 12 columns
Spliting Data Train/Test :
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1, random_state=10)
Hyperparameter Tuning
# Sweep k = 1 .. sqrt(#samples) (a common heuristic upper bound) and record
# recall and F2 score on the held-out test split for every k.
l=list()  # recall at each k
F=list()  # F2 (beta=2) at each k
for i in range (1,int(np.sqrt(len(data_set)))):
    #Setup a knn classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=i)
    #Fit the model
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    # probability of the positive class (kept for ROC analysis)
    y_pred_proba = knn.predict_proba(x_test)[:,1]
    l.append(recall_score(y_test, y_pred))
    F.append(fbeta_score(y_test, y_pred, beta=2))
Plotting Recall and F-Beta at different Values of K:
#Generate plot
plt.title('KNN w/ different nb of neighbors')
# hoist the shared x-axis range instead of computing it once per series
k_range = range(1, int(np.sqrt(len(data_set))))
plt.plot(k_range, l, label='Recall')
plt.plot(k_range, F, label='Fbeta')
plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Score')  # fixed: the axis shows recall/F-beta, not AUC
plt.grid()
plt.show()  # fixed: `plt.show` lacked parentheses, so it was never called
<function matplotlib.pyplot.show(*args, **kw)>
Printing the k
k=F.index(max(F))+1
k
3
Knn classifier with k neighbors
knn = KNeighborsClassifier(n_neighbors=k)
Fitting the Model
# Fit the chosen-k model and compute F2/precision/recall on the test split
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
# F2 (beta=2) weights recall more heavily than precision
fbeta1=fbeta_score(y_test, y_pred, beta=2)
Knn_precision1 =precision_score(y_test, y_pred)
Knn_recall1 =recall_score(y_test, y_pred)
print("F2 Score: ",fbeta1)
F2 Score: 0.7017543859649122
Get accuracy. Note: In case of classification algorithms score method represents accuracy.
knn.score(x_test,y_test)
0.9425287356321839
Confusion_matrix
The table that is used to describe the performance of a classification model (or "classifier") on a set of test data for which the true values are known. The Scikit-learn provides facility to calculate confusion matrix using the confusion_matrix method.
#let us get the predictions using the classifier we had fit above
y_pred = knn.predict(x_test)
confusion_matrix(y_test,y_pred)
array([[74, 1],
[ 4, 8]], dtype=int64)
Knn_confusion_matrix=confusion_matrix(y_test,y_pred)
sns.heatmap(Knn_confusion_matrix, annot=True, fmt='g',cmap="twilight")
<matplotlib.axes._subplots.AxesSubplot at 0x23357051f10>
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
| Predicted | 0 | 1 | All |
|---|---|---|---|
| True | |||
| 0 | 74 | 1 | 75 |
| 1 | 4 | 8 | 12 |
| All | 78 | 9 | 87 |
Classification Report
Another important report is the Classification report. It is a text summary of the precision, recall, F1 score for each class. Scikit-learn provides facility to calculate Classification report using the classification_report method.
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 0.95 0.99 0.97 75
1 0.89 0.67 0.76 12
accuracy 0.94 87
macro avg 0.92 0.83 0.86 87
weighted avg 0.94 0.94 0.94 87
ROC (Receiver Operating Characteristic)
It is a plot of the true positive rate against the false positive rate for the different possible cutpoints of a diagnostic test.
An ROC curve demonstrates several things:
1) It shows the tradeoff between sensitivity and specificity
2) The closer the curve follows the left-hand border and then the top border of the ROC space, the more accurate the test (the larger the area under the curve, the better).
3)The closer the curve comes to the 45-degree diagonal of the ROC space, the less accurate the test.
4) The area under the curve is a measure of test accuracy.
# Positive-class probabilities -> ROC curve points and AUC for model 1
y_pred_proba = knn.predict_proba(x_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc1=roc_auc_score(y_test, y_pred_proba)
thresholds
array([2. , 1. , 0.66666667, 0.33333333, 0. ])
plt.plot(fpr, tpr,'r-',label = 'KNN')
plt.plot([0,1],[0,1],'k-',label='random')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid()
plt.show()
# Model 2: rebuild x/y from the (un-encoded) dataset, then min-max scale
# every feature into [0, 1] so distance-based KNN is not dominated by
# large-magnitude columns such as age or location codes.
y =data_set["result"]
x= data_set.drop(columns=["result"])
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (0,1))
scaled_values = scaler.fit_transform(x)
# write the scaled values back in place, keeping the DataFrame's
# index and column labels
x.loc[:,:] = scaled_values
x
| location | country | gender | age | vis_wuhan | from_wuhan | symptom1 | symptom2 | symptom3 | symptom4 | diff_sym_hos | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| index | |||||||||||
| 0 | 0.753623 | 0.242424 | 0.5 | 0.680851 | 1.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.65 |
| 1 | 0.731884 | 0.242424 | 0.0 | 0.574468 | 0.0 | 1.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 |
| 2 | 0.992754 | 0.242424 | 0.5 | 0.468085 | 0.0 | 1.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.90 |
| 3 | 0.840580 | 0.242424 | 0.0 | 0.617021 | 1.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 |
| 4 | 0.840580 | 0.242424 | 0.5 | 0.595745 | 0.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 858 | 0.347826 | 0.090909 | 1.0 | 0.234043 | 0.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 |
| 859 | 0.000000 | 0.000000 | 1.0 | 0.351064 | 0.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 |
| 860 | 0.021739 | 0.030303 | 0.5 | 0.504255 | 0.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 |
| 861 | 0.173913 | 0.272727 | 0.5 | 0.504255 | 0.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 |
| 862 | 0.108696 | 0.818182 | 0.5 | 0.723404 | 0.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 |
863 rows × 11 columns
Splitting Data Train/Test
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1, random_state=10)
Hyperparameter Tuning
# Same k sweep as model 1, now on the normalized features.
l=list()  # recall at each k
F=list()  # F2 (beta=2) at each k
for i in range (1,int(np.sqrt(len(data_set)))):
    #Setup a knn classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=i)
    #Fit the model
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    # probability of the positive class (kept for ROC analysis)
    y_pred_proba = knn.predict_proba(x_test)[:,1]
    l.append(recall_score(y_test, y_pred))
    F.append(fbeta_score(y_test, y_pred, beta=2))
Plotting Recall and F-Beta at different Values of K:
#Generate plot
plt.title('KNN w/ different nb of neighbors')
# hoist the shared x-axis range instead of computing it once per series
k_range = range(1, int(np.sqrt(len(data_set))))
plt.plot(k_range, l, label='Recall')
plt.plot(k_range, F, label='Fbeta')
plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Score')  # fixed: the axis shows recall/F-beta, not AUC
plt.grid()
plt.show()  # fixed: `plt.show` lacked parentheses, so it was never called
<function matplotlib.pyplot.show(*args, **kw)>
Printing the k
k=F.index(max(F))+1
k
3
Knn classifier with k neighbors:
knn = KNeighborsClassifier(n_neighbors=k)
Fitting the Model
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
Get accuracy. Note: In case of classification algorithms score method represents accuracy.
knn.score(x_test,y_test)
0.9770114942528736
Confusion_matrix
#let us get the predictions using the classifier we had fit above
y_pred = knn.predict(x_test)
confusion_matrix(y_test,y_pred)
array([[75, 0],
[ 2, 10]], dtype=int64)
Knn_confusion_matrix=confusion_matrix(y_test,y_pred)
sns.heatmap(Knn_confusion_matrix, annot=True, fmt='g',cmap="twilight")
<matplotlib.axes._subplots.AxesSubplot at 0x233581d54c0>
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
| Predicted | 0 | 1 | All |
|---|---|---|---|
| True | |||
| 0 | 75 | 0 | 75 |
| 1 | 2 | 10 | 12 |
| All | 77 | 10 | 87 |
Classification Report
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 0.97 1.00 0.99 75
1 1.00 0.83 0.91 12
accuracy 0.98 87
macro avg 0.99 0.92 0.95 87
weighted avg 0.98 0.98 0.98 87
fbeta2=fbeta_score(y_test, y_pred, beta=2)
Knn_precision2 =precision_score(y_test, y_pred)
Knn_recall2 =recall_score(y_test, y_pred)
print("F2 Score: ",fbeta2)
F2 Score: 0.8620689655172415
ROC (Reciever Operating Charecteristic)
y_pred_proba = knn.predict_proba(x_test)[:,1]
fpr_2, tpr_2, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc2=roc_auc_score(y_test, y_pred_proba)
thresholds
array([2. , 1. , 0.66666667, 0.33333333, 0. ])
plt.plot(fpr_2, tpr_2,'r-',label = 'KNN')
plt.plot([0,1],[0,1],'k-',label='random')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid()
plt.show()
# Model 3: one-hot encoding AND min-max normalization combined.
y =data_set["result"]
x= data_set.drop(columns=["result"])
var=pd.get_dummies(x,columns =["gender","location","country"])
x=pd.DataFrame(data=var)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (0,1))
scaled_values = scaler.fit_transform(x)
# write the scaled values back in place, preserving index/columns
x.loc[:,:] = scaled_values
x
| age | vis_wuhan | from_wuhan | symptom1 | symptom2 | symptom3 | symptom4 | diff_sym_hos | gender_0 | gender_1 | ... | country_24 | country_25 | country_26 | country_27 | country_28 | country_29 | country_30 | country_31 | country_32 | country_33 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| index | |||||||||||||||||||||
| 0 | 0.680851 | 1.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.65 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.574468 | 0.0 | 1.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.468085 | 0.0 | 1.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.90 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0.617021 | 1.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 0.595745 | 0.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 858 | 0.234043 | 0.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 859 | 0.351064 | 0.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 860 | 0.504255 | 0.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 861 | 0.504255 | 0.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 862 | 0.723404 | 0.0 | 0.0 | 0.583333 | 1.0 | 1.0 | 1.0 | 0.25 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
863 rows × 184 columns
Splitting Data Train/Test:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1, random_state=10)
Hyperparameter Tuning
# Same k sweep again, now on the encoded + normalized features.
l=list()  # recall at each k
F=list()  # F2 (beta=2) at each k
for i in range (1,int(np.sqrt(len(data_set)))):
    #Setup a knn classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=i)
    #Fit the model
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    # probability of the positive class (kept for ROC analysis)
    y_pred_proba = knn.predict_proba(x_test)[:,1]
    l.append(recall_score(y_test, y_pred))
    F.append(fbeta_score(y_test, y_pred, beta=2))
Plotting Recall and F-Beta at different Values of K:
#Generate plot
plt.title('KNN w/ different nb of neighbors')
# hoist the shared x-axis range instead of computing it once per series
k_range = range(1, int(np.sqrt(len(data_set))))
plt.plot(k_range, l, label='Recall')
plt.plot(k_range, F, label='Fbeta')
plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Score')  # fixed: the axis shows recall/F-beta, not AUC
plt.grid()
plt.show()  # fixed: `plt.show` lacked parentheses, so it was never called
<function matplotlib.pyplot.show(*args, **kw)>
Printing the k
k=F.index(max(F))+1
k
1
knn classifier with k neighbors
knn = KNeighborsClassifier(n_neighbors=k)
Fitting the Model
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
fbeta3=fbeta_score(y_test, y_pred, beta=2)
Knn_precision3 =precision_score(y_test, y_pred)
Knn_recall3=recall_score(y_test, y_pred)
print("F2 Score: ",fbeta3)
F2 Score: 0.6034482758620691
Get accuracy. Note: In case of classification algorithms score method represents accuracy.
knn.score(x_test,y_test)
0.9080459770114943
Confusion_matrix
#let us get the predictions using the classifier we had fit above
y_pred = knn.predict(x_test)
confusion_matrix(y_test,y_pred)
array([[72, 3],
[ 5, 7]], dtype=int64)
Knn_confusion_matrix=confusion_matrix(y_test,y_pred)
sns.heatmap(Knn_confusion_matrix, annot=True, fmt='g',cmap="twilight")
<matplotlib.axes._subplots.AxesSubplot at 0x2335899aa60>
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
| Predicted | 0 | 1 | All |
|---|---|---|---|
| True | |||
| 0 | 72 | 3 | 75 |
| 1 | 5 | 7 | 12 |
| All | 77 | 10 | 87 |
Classification Report
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 0.94 0.96 0.95 75
1 0.70 0.58 0.64 12
accuracy 0.91 87
macro avg 0.82 0.77 0.79 87
weighted avg 0.90 0.91 0.90 87
ROC (Reciever Operating Charecteristic)
y_pred_proba = knn.predict_proba(x_test)[:,1]
fpr_3, tpr_3, thresholds_3 = roc_curve(y_test, y_pred_proba)
roc_auc3=roc_auc_score(y_test, y_pred_proba)
thresholds_3
array([2., 1., 0.])
plt.plot(fpr_3, tpr_3,'r-',label = 'KNN')
plt.plot([0,1],[0,1],'k-',label='random')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid()
plt.show()
Comparison between the 3 models with respect to ROC-AUC curve
# Compare the three KNN preprocessing variants on one ROC plot.
fig = plt.figure(figsize=(10,10))
plt.plot(fpr, tpr, label='ROC Curve 1 (AUC = %0.2f)' % (roc_auc1))
# Fixed: curves 2 and 3 previously reused roc_auc1 in their labels,
# so all three legends showed the same (wrong) AUC.
plt.plot(fpr_2, tpr_2, label='ROC Curve 2 (AUC = %0.2f)' % (roc_auc2))
plt.plot(fpr_3, tpr_3, label='ROC Curve 3 (AUC = %0.2f)' % (roc_auc3))
plt.plot([0, 1], [0, 1], linestyle='--', color='red', label='Random Classifier')
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='green')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")
plt.grid()
plt.show()
fig = plt.figure(figsize=(10,10))
plt.plot(["One_hot encoding", "Normalization", "Both Together"], [fbeta1, fbeta2, fbeta3],marker='x')
plt.ylabel('F-Beta')
plt.grid()
plt.show()
Comparison between the 3 models with respect to Recall, Precision, and AUC
# Compare the three KNN preprocessing variants on AUC, recall, precision.
fig = plt.figure(figsize=(15,10))
# hoist the shared x-axis labels instead of repeating the list three times
models = ['One Hot Encoding', 'Normalization', 'Both']
plt.plot(models,[roc_auc1,roc_auc2,roc_auc3],marker='x',label='AUC')
plt.plot(models,[Knn_recall1,Knn_recall2,Knn_recall3],marker='x',label='Recall')
plt.plot(models,[Knn_precision1,Knn_precision2,Knn_precision3],marker='x',label='precision')
plt.title('AUC, Recall, Precision')  # fixed typo "Perscrion"
plt.legend()
plt.grid()
plt.show()
KNN Conclusion
Based on the previous graph, comparing by F-Beta and ROC, the best model is the one with Normalization only.
we use Gaussian NB Classifier. We first split the data to train, validation and test sets as 70%, 15% and 15% respectively.
y =data_set["result"]
x= data_set.drop(columns=["result"])
x
| location | country | gender | age | vis_wuhan | from_wuhan | symptom1 | symptom2 | symptom3 | symptom4 | diff_sym_hos | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| index | |||||||||||
| 0 | 104 | 8 | 1 | 66.0 | 1 | 0 | 14 | 31 | 19 | 12 | 8 |
| 1 | 101 | 8 | 0 | 56.0 | 0 | 1 | 14 | 31 | 19 | 12 | 0 |
| 2 | 137 | 8 | 1 | 46.0 | 0 | 1 | 14 | 31 | 19 | 12 | 13 |
| 3 | 116 | 8 | 0 | 60.0 | 1 | 0 | 14 | 31 | 19 | 12 | 0 |
| 4 | 116 | 8 | 1 | 58.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 858 | 48 | 3 | 2 | 24.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 |
| 859 | 0 | 0 | 2 | 35.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 |
| 860 | 3 | 1 | 1 | 49.4 | 0 | 0 | 14 | 31 | 19 | 12 | 0 |
| 861 | 24 | 9 | 1 | 49.4 | 0 | 0 | 14 | 31 | 19 | 12 | 0 |
| 862 | 15 | 27 | 1 | 70.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 |
863 rows × 11 columns
#one hot encoding
# NOTE(review): this encodes `data_set`, but the train/test split below is
# taken from the previously built `x`/`y` (L530-531), so the encoded frame
# never reaches the Naive Bayes model — confirm this is intended.
var = pd.get_dummies(data_set,columns =["gender","location","country"])
data_set = pd.DataFrame(data=var)
data_set
| age | vis_wuhan | from_wuhan | symptom1 | symptom2 | symptom3 | symptom4 | diff_sym_hos | result | gender_0 | ... | country_24 | country_25 | country_26 | country_27 | country_28 | country_29 | country_30 | country_31 | country_32 | country_33 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| index | |||||||||||||||||||||
| 0 | 66.0 | 1 | 0 | 14 | 31 | 19 | 12 | 8 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 56.0 | 0 | 1 | 14 | 31 | 19 | 12 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 46.0 | 0 | 1 | 14 | 31 | 19 | 12 | 13 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 60.0 | 1 | 0 | 14 | 31 | 19 | 12 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 58.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 858 | 24.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 859 | 35.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 860 | 49.4 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 861 | 49.4 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 862 | 70.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
863 rows × 185 columns
Splitting Train/Test:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.15, random_state = 10, shuffle=True)
Model Training
# Train a baseline GaussianNB with default hyperparameters
classifier1 = GaussianNB()
classifier1.fit(x_train, y_train)
y_pred_prob = classifier1.predict_proba(x_test)
# keep only P(class = 1) for ROC/AUC computations
y_pred_prob = y_pred_prob[:, 1]
y_pred = classifier1.predict(x_test)
Confusion Matrix
# Confusion matrix of the baseline Naive Bayes model.
# Fixed: the original assigned the confusion matrix to `classifier1`,
# clobbering the fitted model; use a separate variable (matching the
# tuned-model cell, which uses `cm`).
cm1 = confusion_matrix(y_test, y_pred)
f,ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm1, annot=True, linewidths=0.5, fmt= '.0f',cmap="twilight")
plt.show()
Classification Report
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 0.90 0.96 0.93 112
1 0.60 0.33 0.43 18
accuracy 0.88 130
macro avg 0.75 0.65 0.68 130
weighted avg 0.86 0.88 0.86 130
# ROC inputs and summary metrics for the untuned Naive Bayes model
NB_fpr1, NB_tpr1, thresholds = roc_curve(y_test, y_pred_prob)
NB_fbeta1 = fbeta_score(y_test, y_pred, beta=2)
NB_precision1 =precision_score(y_test, y_pred)
NB_AUC1=roc_auc_score(y_test, y_pred_prob)
NB_recall1 =recall_score(y_test, y_pred)
print("F2 Score: ", NB_fbeta1)
F2 Score: 0.36585365853658536
print("Area under ROC: ",NB_AUC1)
plt.plot(NB_fpr1, NB_tpr1,'r-',label = 'Naïve Bayes')
plt.plot([0,1],[0,1],'k-',label='random')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.grid()
plt.show()
Area under ROC: 0.8973214285714286
GaussianNB has two hyperparameters: priors and var_smoothing.
Splitting Train/Test:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.15, random_state = 10,shuffle=True)
Validation to Get Best Hyperparameters:
# Grid-search the two GaussianNB hyperparameters:
#   priors        -- fixed class prior probabilities [P(0), P(1)]
#   var_smoothing -- portion of the largest variance added to all variances
classifier2 = GaussianNB()
# 100 smoothing values log-spaced from 1e9 down to 1e-9
smooth_range = np.logspace(9,-9, num=100)
probs = [[0.15,0.85],[0.2,0.8],[0.25,0.75],[0.3,0.7],[0.5,0.5],[0.0,1.0],[0.05,0.95],[0.1,0.9],[0.35,0.65],[0.4,0.6],[0.45,0.55],[0.55,0.45],[0.6,0.4],[0.65,0.35],[0.7,0.3],[0.75,0.25],[0.8,0.2],[0.85,0.15],[0.9,0.1],[0.95,0.05],[1,0]]
param_grid = [
{
'priors' : probs,
'var_smoothing' : smooth_range
}
]
# optimize for F2 (recall-weighted) rather than plain accuracy
fbeta = make_scorer(fbeta_score, beta=2)
# 5-fold CV over 21 priors x 100 smoothing values = 2100 candidates
grid = GridSearchCV(classifier2, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1, scoring=fbeta)
bestclassifier = grid.fit(x_train, y_train)
classifier2 = bestclassifier.best_estimator_
Fitting 5 folds for each of 2100 candidates, totalling 10500 fits
Training the Model with the best hyperparameter:
classifier2.fit(x_train,y_train)
GaussianNB(priors=[0.55, 0.45], var_smoothing=0.06579332246575682)
y_pred_prob = classifier2.predict_proba(x_test)
y_pred_prob = y_pred_prob[:, 1]
y_pred = classifier2.predict(x_test)
Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
f,ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, linewidths=0.5, fmt= '.0f',cmap="twilight")
plt.show()
Classification Report
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 0.95 0.95 0.95 112
1 0.68 0.72 0.70 18
accuracy 0.92 130
macro avg 0.82 0.83 0.83 130
weighted avg 0.92 0.92 0.92 130
NB_fpr2, NB_tpr2, thresholds = roc_curve(y_test, y_pred_prob)
NB_fbeta2 = fbeta_score(y_test, y_pred, beta=2)
NB_precision2 =precision_score(y_test, y_pred)
NB_AUC2=roc_auc_score(y_test, y_pred_prob)
NB_recall2 =recall_score(y_test, y_pred)
print("F2 Score: ", NB_fbeta2)
F2 Score: 0.7142857142857143
print("Area under ROC: ",NB_AUC2)
plt.plot(NB_fpr2, NB_tpr2,'r-',label = 'Naive Bayes')
plt.plot([0,1],[0,1],'k-',label='random')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.grid()
plt.show()
Area under ROC: 0.8953373015873016
Comparison between the 2 models with respect to ROC-AUC curve
fig = plt.figure(figsize=(10,10))
plt.plot(NB_fpr1, NB_tpr1, label='ROC Curve 1 (AUC = %0.2f)' % (NB_AUC1))
plt.plot(NB_fpr2, NB_tpr2, label='ROC Curve 2 (AUC = %0.2f)' % (NB_AUC2))
plt.plot([0, 1], [0, 1], linestyle='--', color='red', label='Random Classifier')
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='green', label='Perfect Classifier')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.grid()
plt.show()
fig = plt.figure(figsize=(15,10))
plt.plot(["W/out Hyperparamter tuning", "W/ Hyperparamter tuning"], [NB_fbeta1, NB_fbeta2],marker='x',color='green')
plt.ylabel('F-Beta')
plt.grid()
plt.show()
Comparison between the 2 models with respect to Recall, Precision, and AUC
# Compare the untuned vs. tuned Naive Bayes models on three metrics.
fig = plt.figure(figsize=(15,10))
# hoist the shared x-axis labels instead of repeating the list three times
model_names = ['Without Hyperparameter Tuning', 'With Hyperparameter Tuning']
plt.plot(model_names,[NB_recall1,NB_recall2],marker='x',label='Recall')
plt.plot(model_names,[NB_precision1,NB_precision2],marker='x',label='precision')
plt.plot(model_names,[NB_AUC1,NB_AUC2],marker='x',label='AUC')
plt.title('Recall, Precision, AUC')  # fixed typo "Perscrion"
plt.legend()
plt.grid()
plt.show()
Naïve Bayes Conclusion
Based on the previous graph, comparing by F-Beta, the best model is the one with hyperparameter tuning.
-EXTRA PART-
# Reload a fresh copy of the raw data for the from-scratch Naive Bayes
data_set = pd.read_csv('data.csv')
data_set.head(10)
data_set.shape
(863, 15)
#Naming column zero 'Index':
data_set.columns.values[0]="index"
data_set=data_set.set_index("index")
# drop the two near-constant symptom columns identified earlier
data_set=data_set.drop(columns=["symptom5","symptom6"])
# Y = target column, X = all remaining features
Y =data_set["result"]
X= data_set.drop(columns=["result"])
def calculate_prior(data_set, Y):
    """Return the prior probability of each class in column `Y`.

    Priors are ordered by the sorted class labels, i.e. the j-th entry
    is P(Y == sorted_labels[j]) estimated as a row fraction.
    """
    total = len(data_set)
    class_labels = sorted(data_set[Y].unique().tolist())
    return [len(data_set[data_set[Y] == label]) / total for label in class_labels]
def calculate_likelihood_gaussian(data_set, feat_name, feat_val, Y, label):
    """Return the Gaussian likelihood P(feat_name = feat_val | Y = label).

    The mean and (sample) standard deviation of `feat_name` are estimated
    from the rows of `data_set` whose `Y` column equals `label`, and the
    normal pdf is evaluated at `feat_val`.
    """
    # Removed the unused local `feat = list(data_set.columns)` from the
    # original; restrict to the rows of the requested class.
    subset = data_set[data_set[Y] == label]
    mean, std = subset[feat_name].mean(), subset[feat_name].std()
    # NOTE(review): std == 0 (feature constant within a class) divides by
    # zero and yields inf/nan — consider adding a small smoothing term.
    p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) * np.exp(-((feat_val - mean) ** 2 / (2 * std ** 2)))
    return p_x_given_y
def naive_bayes_gaussian(data_set, X, Y):
    """Predict class indices for the samples in X using Gaussian Naive Bayes.

    Parameters
    ----------
    data_set : pd.DataFrame
        Training data. Every column except the last is treated as a feature;
        feature order must match the column order of ``X``. ``Y`` must name
        the target column (assumed to be the last column -- TODO confirm).
    X : array-like of shape (n_samples, n_features)
        Samples to classify.
    Y : str
        Name of the target column in ``data_set``.

    Returns
    -------
    np.ndarray
        Per-sample argmax index over the classes sorted ascending. This
        equals the label itself when labels are 0..k-1, as in this dataset.
    """
    features = list(data_set.columns)[:-1]
    # Loop-invariant quantities hoisted out of the per-sample loop
    # (the original recomputed the sorted label list for every sample).
    labels = sorted(data_set[Y].unique())
    prior = calculate_prior(data_set, Y)

    Y_pred = []
    for x in X:
        # Unnormalised posterior per class:
        # prior * product of per-feature Gaussian likelihoods.
        post_prob = []
        for j, label in enumerate(labels):
            likelihood = prior[j]
            for i, feat in enumerate(features):
                likelihood *= calculate_likelihood_gaussian(
                    data_set, feat, x[i], Y, label
                )
            post_prob.append(likelihood)
        # The normalising evidence term is constant across classes, so the
        # argmax of the numerator is the MAP prediction.
        Y_pred.append(np.argmax(post_prob))
    return np.array(Y_pred)
# Hold out 15% of the data as a test set for the hand-written classifier.
train, test = train_test_split(data_set, test_size=0.15, random_state=10)
# Last column is assumed to be the target ("result"); the rest are features.
X_test = test.iloc[:,:-1].values
Y_test = test.iloc[:,-1].values
Y_pred = naive_bayes_gaussian(train, X = X_test, Y="result")
# Evaluate the custom implementation on the held-out set.
print(confusion_matrix(Y_test, Y_pred))
print("f1-score: ",f1_score(Y_test, Y_pred))
print("recall score: ",recall_score(Y_test, Y_pred))
print("precision score: ",precision_score(Y_test, Y_pred))
fpr, tpr, thresholds = roc_curve(Y_test,Y_pred)
[[108 4] [ 12 6]] f1-score: 0.42857142857142855 recall score: 0.3333333333333333 precision score: 0.6
fbeta=f1_score(Y_test, Y_pred)
recall=recall_score(Y_test, Y_pred)
precision=precision_score(Y_test, Y_pred)
print("f1-score: ",f1_score(Y_test, Y_pred))
f1-score: 0.42857142857142855
NB_confusion_matrix=confusion_matrix(Y_test,Y_pred)
sns.heatmap(NB_confusion_matrix, annot=True, fmt='g',cmap="twilight")
<matplotlib.axes._subplots.AxesSubplot at 0x23358d30490>
print (classification_report(Y_test, Y_pred))
precision recall f1-score support
0 0.90 0.96 0.93 112
1 0.60 0.33 0.43 18
accuracy 0.88 130
macro avg 0.75 0.65 0.68 130
weighted avg 0.86 0.88 0.86 130
fpr, tpr, thresholds = roc_curve(Y_test,Y_pred)
auc = roc_auc_score(Y_test, Y_pred)
print (auc)
0.6488095238095237
# ROC curve of the hand-written Naive Bayes. Note it is built from hard 0/1
# predictions rather than probabilities, so it has a single operating point.
plt.plot(fpr, tpr, label = 'NaiveBayes (AUC = %0.5f)' % auc)
plt.plot( [0, 1], [0, 1], linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.grid()
plt.show()
We first split the data into train and test sets (85% and 15% respectively); validation for hyperparameter selection is performed later via cross-validation on the training set.
y =data_set["result"]
x= data_set.drop(columns=["result"])
Spliting Data Train/Test :
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.15,random_state=10, shuffle=True)
Now: We will use the StandardScaler to scale the features before using the GridSearchCV
sc = StandardScaler()
x_train1 = sc.fit_transform(x_train)
x_test1 = sc.transform(x_test)
Traning the Model
Logistic_Regression_Classifier = linear_model.LogisticRegression()
Logistic_Regression_Classifier.fit(x_train1, y_train)
y_pred_prob = Logistic_Regression_Classifier.predict_proba(x_test1)
y_pred_prob = y_pred_prob[:,1]
y_pred = Logistic_Regression_Classifier.predict(x_test1)
Confusion_matrix
#let us get the predictions using the classifier we had fit above
y_pred = Logistic_Regression_Classifier.predict(x_test1)
confusion_matrix(y_test,y_pred)
array([[110, 2],
[ 7, 11]], dtype=int64)
LR_confusion_matrix=confusion_matrix(y_test,y_pred)
sns.heatmap(LR_confusion_matrix, annot=True, fmt='g',cmap="twilight")
<matplotlib.axes._subplots.AxesSubplot at 0x233581db400>
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
| Predicted | 0 | 1 | All |
|---|---|---|---|
| True | |||
| 0 | 110 | 2 | 112 |
| 1 | 7 | 11 | 18 |
| All | 117 | 13 | 130 |
Classification Report
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 0.94 0.98 0.96 112
1 0.85 0.61 0.71 18
accuracy 0.93 130
macro avg 0.89 0.80 0.84 130
weighted avg 0.93 0.93 0.93 130
ROC (Reciever Operating Charecteristic)
y_pred_proba = Logistic_Regression_Classifier.predict_proba(x_test1)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
AUC1=roc_auc_score( y_test, y_pred_prob)
fbeta1 = fbeta_score(y_test, y_pred, beta=2)
R1 =recall_score(y_test, y_pred)
P1 =precision_score(y_test, y_pred)
print("F2 Score: ", fbeta1)
F2 Score: 0.6470588235294118
plt.plot(fpr, tpr,'r-',label = 'Logisitic Regression')
plt.plot([0,1],[0,1],'k-',label='random')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
print("AUC on test data = %f" % AUC1)
plt.show()
AUC on test data = 0.932540
Validation to Get the Best Hyperparameters
y = data_set['result']
X = data_set.drop(columns=['result'])
Spliting Data Train/Test
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.15,random_state=10, shuffle=True)
scal = StandardScaler()
x_train1 = scal.fit_transform(x_train)
x_test1 = scal.transform(x_test)
Logistic_Regression_Classifier2 = LogisticRegression( max_iter=3000)
weights= np.linspace(0.0,0.99,100)
W=[{0:x, 1:1.0-x} for x in weights]
param_grid = [
{
'C' : np.logspace(-4, 4, 50),
'class_weight' : W
}
]
fbeta2 = make_scorer(fbeta_score, beta=2)
Logistic_Regression_Classifier2 = GridSearchCV(Logistic_Regression_Classifier2, param_grid = param_grid, cv =5, verbose=True, n_jobs=-1, scoring=fbeta2)
best_clf = Logistic_Regression_Classifier2.fit(x_train1,y_train)
Logistic_Regression_Classifier2=Logistic_Regression_Classifier2.best_estimator_
Fitting 5 folds for each of 5000 candidates, totalling 25000 fits
Traning the Model
Logistic_Regression_Classifier2.fit(x_train1, y_train)
LogisticRegression(C=0.5689866029018293, class_weight={0: 0.26, 1: 0.74},
max_iter=3000)
y_pred = Logistic_Regression_Classifier2.predict(x_test1)
y_pred_prob = Logistic_Regression_Classifier2.predict_proba(x_test1)
y_pred_prob =y_pred_prob[:,1]
Confusion Matrix
y_pred = Logistic_Regression_Classifier2.predict(x_test1)
confusion_matrix(y_test,y_pred)
array([[108, 4],
[ 6, 12]], dtype=int64)
LR_confusion_matrix=confusion_matrix(y_test,y_pred)
sns.heatmap(LR_confusion_matrix, annot=True, fmt='g',cmap="twilight")
<matplotlib.axes._subplots.AxesSubplot at 0x23354b72340>
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
| Predicted | 0 | 1 | All |
|---|---|---|---|
| True | |||
| 0 | 108 | 4 | 112 |
| 1 | 6 | 12 | 18 |
| All | 114 | 16 | 130 |
Classification Report
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 0.95 0.96 0.96 112
1 0.75 0.67 0.71 18
accuracy 0.92 130
macro avg 0.85 0.82 0.83 130
weighted avg 0.92 0.92 0.92 130
y_pred_proba = Logistic_Regression_Classifier2.predict_proba(x_test1)[:,1]
fpr2, tpr2, thresholds = roc_curve(y_test, y_pred_proba)
AUC2=roc_auc_score(y_test, y_pred_proba)
fbeta2 = fbeta_score(y_test, y_pred, beta=2)
R2 =recall_score(y_test, y_pred)
P2 =precision_score(y_test, y_pred)
print("F2 Score: ", fbeta2)
F2 Score: 0.6818181818181819
plt.plot(fpr2, tpr2,'r-',label = 'Logisitic Regression')
plt.plot([0,1],[0,1],'k-',label='random')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
AUC2=roc_auc_score(y_test, y_pred_prob)
print("AUC on test data = %f" % AUC2)
plt.grid()
plt.show()
AUC on test data = 0.933532
y =data_set["result"]
x= data_set.drop(columns=["result"])
var=pd.get_dummies(x,columns =["gender","location","country"])
x=pd.DataFrame(data=var)
x
| age | vis_wuhan | from_wuhan | symptom1 | symptom2 | symptom3 | symptom4 | diff_sym_hos | gender_0 | gender_1 | ... | country_24 | country_25 | country_26 | country_27 | country_28 | country_29 | country_30 | country_31 | country_32 | country_33 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| index | |||||||||||||||||||||
| 0 | 66.0 | 1 | 0 | 14 | 31 | 19 | 12 | 8 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 56.0 | 0 | 1 | 14 | 31 | 19 | 12 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 46.0 | 0 | 1 | 14 | 31 | 19 | 12 | 13 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 60.0 | 1 | 0 | 14 | 31 | 19 | 12 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 58.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 858 | 24.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 859 | 35.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 860 | 49.4 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 861 | 49.4 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 862 | 70.0 | 0 | 0 | 14 | 31 | 19 | 12 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
863 rows × 184 columns
Spltting data Train/Test
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.15,random_state=10, shuffle=True)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
Validation to get the best hyperparmeter:
Logistic_Regression_Classifier3 = LogisticRegression( max_iter=3000)
weights= np.linspace(0.0,0.99,100)
W=[{0:x, 1:1.0-x} for x in weights]
param_grid = [
{
'C' : np.logspace(-4, 4, 50),
'class_weight' : W
}
]
fbeta3 = make_scorer(fbeta_score, beta=2)
Logistic_Regression_Classifier3 = GridSearchCV(Logistic_Regression_Classifier3 , param_grid = param_grid, cv =5, verbose=10, n_jobs=-1, scoring=fbeta3)
best_clf =Logistic_Regression_Classifier3 .fit(x_train,y_train)
Logistic_Regression_Classifier3 =Logistic_Regression_Classifier3.best_estimator_
Fitting 5 folds for each of 5000 candidates, totalling 25000 fits
Traning the Model
Logistic_Regression_Classifier3.fit(x_train, y_train)
LogisticRegression(C=51.79474679231202, class_weight={0: 0.02, 1: 0.98},
max_iter=3000)
y_pred = Logistic_Regression_Classifier3.predict(x_test)
y_pred_prob = Logistic_Regression_Classifier3.predict_proba(x_test)
y_pred_prob =y_pred_prob[:,1]
Confusion Matrix
confusion_matrix(y_test,y_pred)
array([[109, 3],
[ 1, 17]], dtype=int64)
LR_confusion_matrix=confusion_matrix(y_test,y_pred)
sns.heatmap(LR_confusion_matrix, annot=True, fmt='g',cmap="twilight")
<matplotlib.axes._subplots.AxesSubplot at 0x23358e0be50>
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
| Predicted | 0 | 1 | All |
|---|---|---|---|
| True | |||
| 0 | 109 | 3 | 112 |
| 1 | 1 | 17 | 18 |
| All | 110 | 20 | 130 |
Classification Report
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 0.99 0.97 0.98 112
1 0.85 0.94 0.89 18
accuracy 0.97 130
macro avg 0.92 0.96 0.94 130
weighted avg 0.97 0.97 0.97 130
y_pred_proba = Logistic_Regression_Classifier3.predict_proba(x_test)[:,1]
fpr3, tpr3, thresholds = roc_curve(y_test, y_pred_proba)
AUC3=roc_auc_score(y_test, y_pred_proba)
fbeta3 = fbeta_score(y_test, y_pred, beta=2)
R3 =recall_score(y_test, y_pred)
P3 =precision_score(y_test, y_pred)
print("F2 Score: ", fbeta3)
F2 Score: 0.9239130434782609
plt.plot(fpr3, tpr3,'r-',label = 'Logisitic Regression')
plt.plot([0,1],[0,1],'k-',label='random')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
auc3=roc_auc_score(y_test, y_pred_prob)
print("AUC on test data = %f" % AUC3)
plt.grid()
plt.show()
AUC on test data = 0.980159
Comparison between the 3 models with respect to ROC-AUC curve
# Compare the three logistic-regression variants on one ROC plot.
# Legend labels fixed: (fpr, AUC1) is the untuned model, (fpr2, AUC2) the
# hyperparameter-tuned one, and (fpr3, AUC3) the one-hot-encoded one --
# the previous labels ('tuning'/'implem'/'sklearn') were copy-pasted wrong.
fig = plt.figure(figsize=(10,10))
plt.plot(fpr, tpr, label='ROC Curve w/out tuning (AUC = %0.2f)' % (AUC1))
plt.plot(fpr2, tpr2, label='ROC Curve w/ tuning (AUC = %0.2f)' % (AUC2))
plt.plot(fpr3, tpr3, label='ROC Curve one-hot encoding (AUC = %0.2f)' % (AUC3))
plt.plot([0, 1], [0, 1], linestyle='--', color='red', label='Random Classifier')
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='green', label='Perfect Classifier')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.grid()
plt.show()
# F2 score of the three variants side by side.
fig = plt.figure(figsize=(15,10))
plt.plot(["W/out Hyperparameter tuning", "W/ Hyperparameter tuning", "One Hot Encoding"], [fbeta1, fbeta2,fbeta3],marker='x')
plt.ylabel('F-Beta')
plt.grid()
plt.show()
Comparison between the 3 models with respect to Recall, Precision and AUC
# Recall, precision and AUC of the three logistic-regression variants
# (untuned, tuned, and one-hot encoded) on one axis.
fig = plt.figure(figsize=(15,10))
plt.plot(['Without Hyperparameter Tuning', 'With Hyperparameter Tuning' ,'One Hot Encoding'],[R1,R2,R3],marker='x',label='Recall')
plt.plot(['Without Hyperparameter Tuning', 'With Hyperparameter Tuning' ,'One Hot Encoding'],[P1,P2,P3],marker='x',label='precision')
plt.plot(['Without Hyperparameter Tuning', 'With Hyperparameter Tuning' ,'One Hot Encoding'],[AUC1,AUC2,AUC3],marker='x',label='AUC')
plt.title('Recall, Perscrion, AUC')
plt.legend()
plt.grid()
plt.show()
Conclusion:
The one-hot-encoded model clearly improves logistic regression, as shown in the previous graphs. Regarding class_weight: the "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data, as n_samples / (n_classes * np.bincount(y)).
In the second milestone of the project we built Decision Tree and SVM classifiers to predict whether somebody will recover or die based on the given dataset.
For every classifier we try different preprocessing on the data, and we compare the resulting models to find the best one from each classifier.
In this classifier we will use different models as:
Without Hyperprameter tuning
With Hyperprameter tuning
and see which works better and have best ROC and AUC as well as F2-Score
y =data_set["result"]
x= data_set.drop(columns=["result"])
Spliting Data Train/Test :
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.1,random_state=88, shuffle=True)
Decision Tree Classifier with criterion entropy
clf_entropy = DecisionTreeClassifier(criterion='entropy',random_state=42)
Training the Model:
clf_entropy.fit(x_train, y_train)
DecisionTreeClassifier(criterion='entropy', random_state=42)
Visualizing the Tree
fig = plt.figure(figsize=(60,55))
_ = tree.plot_tree(clf_entropy,feature_names=x.columns,class_names=["0","1"],filled=True)
y_pred_prob = clf_entropy.predict_proba(x_test)
y_pred_prob = y_pred_prob[:, 1]
y_pred = clf_entropy.predict(x_test)
Confusion Matrix
clf_entropy = confusion_matrix(y_test, y_pred)
f,ax = plt.subplots(figsize=(5, 4))
sns.heatmap(clf_entropy, annot=True, linewidths=0.5, fmt= '.0f',cmap="twilight")
plt.show()
Classification Report
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 0.99 0.99 0.99 69
1 0.94 0.94 0.94 18
accuracy 0.98 87
macro avg 0.96 0.96 0.96 87
weighted avg 0.98 0.98 0.98 87
DT_fpr1, DT_tpr1, thresholds = roc_curve(y_test, y_pred_prob)
DT_fbeta1 = fbeta_score(y_test, y_pred, beta=2)
DT_precision1 =precision_score(y_test, y_pred)
DT_AUC1=roc_auc_score(y_test, y_pred_prob)
DT_recall1 =recall_score(y_test, y_pred)
print("F2 Score: ", DT_fbeta1)
F2 Score: 0.9444444444444444
print("Area under ROC: ",DT_AUC1)
plt.plot(DT_fpr1, DT_tpr1,'r-',label = 'Decision Tree-entropy')
plt.plot([0,1],[0,1],'k-',label='random')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.grid()
plt.show()
Area under ROC: 0.9649758454106281
Decision Tree Classifier with criterion gini index:
clf_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
clf_gini.fit(x_train, y_train)
DecisionTreeClassifier(random_state=42)
Visualizing the Tree
fig = plt.figure(figsize=(60,55))
_ = tree.plot_tree(clf_gini,feature_names=x.columns,class_names=["0","1"],filled=True)
y_pred_prob = clf_gini.predict_proba(x_test)
y_pred_prob = y_pred_prob[:, 1]
y_pred = clf_gini.predict(x_test)
Confusion Matrix
clf_gini = confusion_matrix(y_test, y_pred)
f,ax = plt.subplots(figsize=(5, 4))
sns.heatmap(clf_gini, annot=True, linewidths=0.5, fmt= '.0f',cmap="twilight")
plt.show()
Classification Report
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 0.96 0.97 0.96 69
1 0.88 0.83 0.86 18
accuracy 0.94 87
macro avg 0.92 0.90 0.91 87
weighted avg 0.94 0.94 0.94 87
DT_fpr2, DT_tpr2, thresholds = roc_curve(y_test, y_pred_prob)
DT_fbeta2 = fbeta_score(y_test, y_pred, beta=2)
DT_precision2 =precision_score(y_test, y_pred)
DT_AUC2=roc_auc_score(y_test, y_pred_prob)
DT_recall2 =recall_score(y_test, y_pred)
print("F2 Score: ", DT_fbeta1)
F2 Score: 0.9444444444444444
print("Area under ROC: ",DT_AUC2)
plt.plot(DT_fpr2, DT_tpr2,'r-',label = 'Decision Tree_gini')
plt.plot([0,1],[0,1],'k-',label='random')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.grid()
plt.show()
Area under ROC: 0.9021739130434784
Concluded that: We chose the F2-score as the main comparison metric since, for a medical dataset, we want to minimise false negatives. The entropy and Gini criteria give the same F2-score here; however, they differ slightly in recall and precision, and that small difference shows entropy is better, since both its recall and precision are high (0.94) and its area under the ROC curve is larger (0.9649).
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.1,random_state=88, shuffle=True)
DT_clf= DecisionTreeClassifier()
max_depth=[3,5,8,13,21,34]
min_samples_split=[3,5,8,13,21,34]
min_samples_leaf=[10,11,12,13,14,15,16]
param_grid=[ {'max_depth':max_depth,
'min_samples_split':min_samples_split,
'min_samples_leaf':min_samples_leaf,
}
]
fbeta = make_scorer(fbeta_score, beta=2)
DT_clf=GridSearchCV(DT_clf,param_grid = param_grid, cv =5, verbose=True, n_jobs=-1, scoring=fbeta)
best_clf = DT_clf.fit(x_train,y_train)
DT_clf=DT_clf.best_estimator_
Fitting 5 folds for each of 252 candidates, totalling 1260 fits
Training the Model with the best hyperprameter:
DT_clf.fit(x_train, y_train)
DecisionTreeClassifier(max_depth=5, min_samples_leaf=10, min_samples_split=34)
fig = plt.figure(figsize=(60,55))
_ = tree.plot_tree(DT_clf,feature_names=x.columns,class_names=["0","1"],filled=True)
y_pred_prob = DT_clf.predict_proba(x_test)
y_pred_prob = y_pred_prob[:, 1]
y_pred = DT_clf.predict(x_test)
Confusion Matrix
DT_clf = confusion_matrix(y_test, y_pred)
f,ax = plt.subplots(figsize=(5, 4))
sns.heatmap(DT_clf, annot=True, linewidths=0.5, fmt= '.0f',cmap="twilight")
plt.show()
Classification Report
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 0.97 0.97 0.97 69
1 0.89 0.89 0.89 18
accuracy 0.95 87
macro avg 0.93 0.93 0.93 87
weighted avg 0.95 0.95 0.95 87
ROC (Reciever Operating Charecteristic)
DT_fpr3, DT_tpr3, thresholds = roc_curve(y_test, y_pred_prob)
DT_fbeta3 = fbeta_score(y_test, y_pred, beta=2)
DT_precision3 =precision_score(y_test, y_pred)
DT_AUC3=roc_auc_score(y_test, y_pred_prob)
DT_recall3 =recall_score(y_test, y_pred)
print("F2 Score: ", DT_fbeta2)
F2 Score: 0.8426966292134833
print("Area under ROC: ",DT_AUC3)
plt.plot(DT_fpr3, DT_tpr3,'r-',label = 'DT w/ Hyperparameter')
plt.plot([0,1],[0,1],'k-',label='random')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.grid()
plt.show()
Area under ROC: 0.9484702093397746
Comparison between the 2 models with respect to ROC-AUC curve
For the model without hyperparameter tuning we compare against the entropy criterion, since it was already shown above to have better recall and precision as well as a better ROC.
fig = plt.figure(figsize=(10,10))
plt.plot(DT_fpr1,DT_tpr1, label='ROC Curve 1 (AUC = %0.2f)' % (DT_AUC1))
plt.plot(DT_fpr3, DT_tpr3, label='ROC Curve 2 (AUC = %0.2f)' % (DT_AUC3))
plt.plot([0, 1], [0, 1], linestyle='--', color='green',label='Random Classifier')
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='green', label='Perfect Classifier')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.grid()
plt.show()
fig = plt.figure(figsize=(15,10))
plt.plot(["W/out Hyperparamter tuning", "W/ Hyperparamter tuning"], [DT_fbeta1,DT_fbeta3],marker='x',color='green')
plt.ylabel('F-Beta')
plt.grid()
plt.show()
We will try the classifier once without hyperparameter tuning and once with hyperparameter tuning.
The hyperparameters are:
Kernels: The main function of the kernel is to take low dimensional input space and transform it into a higher-dimensional space. It is mostly useful in non-linear separation problem.
C (Regularisation): C is the penalty parameter, which represents misclassification or error term. The misclassification or error term tells the SVM optimisation how much error is bearable. This is how you can control the trade-off between decision boundary and misclassification term.
Gamma: It defines how far influences the calculation of plausible line of separation, when gamma is higher, nearby points will have high influence; low gamma means far away points also be considered to get the decision boundary
y =data_set["result"]
x= data_set.drop(columns=["result"])
Spliting Data Train/Test:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.1,random_state=10, shuffle=True)
Now: We will use the StandardScaler to scale the features before using the GridSearchCV
sc = StandardScaler()
x_train1 = sc.fit_transform(x_train)
x_test1 = sc.transform(x_test)
Traning the Model:
SVM_Classifier = SVC(max_iter=30000,probability=True,kernel= 'poly', verbose=True,C = 1000.0)
SVM_Classifier.fit(x_train1, y_train)
y_pred_prob = SVM_Classifier.predict_proba(x_test1)
y_pred_prob = y_pred_prob[:,1]
y_pred =SVM_Classifier.predict(x_test1)
[LibSVM]
Confusion_matrix
y_pred = SVM_Classifier.predict(x_test1)
confusion_matrix(y_test,y_pred)
array([[73, 2],
[ 1, 11]], dtype=int64)
CM=confusion_matrix(y_test,y_pred)
sns.heatmap(CM, annot=True, fmt='g',cmap="twilight")
<matplotlib.axes._subplots.AxesSubplot at 0x233590feeb0>
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
| Predicted | 0 | 1 | All |
|---|---|---|---|
| True | |||
| 0 | 73 | 2 | 75 |
| 1 | 1 | 11 | 12 |
| All | 74 | 13 | 87 |
Classification Report
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 0.99 0.97 0.98 75
1 0.85 0.92 0.88 12
accuracy 0.97 87
macro avg 0.92 0.95 0.93 87
weighted avg 0.97 0.97 0.97 87
ROC (Reciever Operating Charecteristic)
y_pred_proba = SVM_Classifier.predict_proba(x_test1)[:,1]
fpr1, tpr1, thresholds = roc_curve(y_test, y_pred_proba)
SVM_AUC1=roc_auc_score( y_test, y_pred_prob)
SVM_fbeta1 = fbeta_score(y_test, y_pred, beta=2)
SVM_R1 =recall_score(y_test, y_pred)
SVM_P1 =precision_score(y_test, y_pred)
print("F2 Score: ", SVM_fbeta1)
F2 Score: 0.9016393442622951
plt.plot(fpr1, tpr1,'r-',label = 'SVM_w/out_Hyperparamter tuning')
plt.plot([0,1],[0,1],'k-',label='random')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
print("AUC on test data = %f" % SVM_AUC1)
plt.show()
AUC on test data = 0.952222
y = data_set['result']
X = data_set.drop(columns=['result'])
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.1,random_state=10, shuffle=True)
scal = StandardScaler()
x_train1 = scal.fit_transform(x_train)
x_test1 = scal.transform(x_test)
SVM_Classifier2 = SVC(probability=True,kernel= 'linear')
weights= np.linspace(0.0,0.99,10)
W=[{0:x, 1:1.0-x} for x in weights]
param_grid = [
{
'C' : np.logspace(-4, 2, 50),
'class_weight' : W,
'degree':[2,3,4,5,6,7,8,9,10],
'gamma' :['scale', 'auto']
}
]
fbeta2 = make_scorer(fbeta_score, beta=2)
SVM_Classifier2 = GridSearchCV(SVM_Classifier2, param_grid = param_grid, cv =5, verbose=10, n_jobs=-1, scoring=fbeta2)
best_clf = SVM_Classifier2.fit(x_train1,y_train)
SVM_Classifier2=SVM_Classifier2.best_estimator_
Fitting 5 folds for each of 9000 candidates, totalling 45000 fits
Traning the Model with the best hyperpramater
SVM_Classifier2.fit(x_train1, y_train)
SVC(C=0.1151395399326447, class_weight={0: 0.22, 1: 0.78}, degree=2,
kernel='linear', probability=True)
y_pred = SVM_Classifier2.predict(x_test1)
y_pred_prob = SVM_Classifier2.predict_proba(x_test1)
y_pred_prob =y_pred_prob[:,1]
Confusion Matrix
y_pred = SVM_Classifier2.predict(x_test1)
confusion_matrix(y_test,y_pred)
array([[70, 5],
[ 2, 10]], dtype=int64)
svm_cm=confusion_matrix(y_test,y_pred)
sns.heatmap(svm_cm, annot=True, fmt='g',cmap="twilight")
<matplotlib.axes._subplots.AxesSubplot at 0x233595ec700>
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
| Predicted | 0 | 1 | All |
|---|---|---|---|
| True | |||
| 0 | 70 | 5 | 75 |
| 1 | 2 | 10 | 12 |
| All | 72 | 15 | 87 |
Classification Report
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 0.97 0.93 0.95 75
1 0.67 0.83 0.74 12
accuracy 0.92 87
macro avg 0.82 0.88 0.85 87
weighted avg 0.93 0.92 0.92 87
ROC (Reciever Operating Charecteristic)
y_pred_proba = SVM_Classifier2.predict_proba(x_test1)[:,1]
fpr2, tpr2, thresholds = roc_curve(y_test, y_pred_proba)
SVM_AUC2=roc_auc_score(y_test, y_pred_proba)
SVM_fbeta2 = fbeta_score(y_test, y_pred, beta=2)
SVM_R2 =recall_score(y_test, y_pred)
SVM_P2 =precision_score(y_test, y_pred)
print("F2 Score: ", SVM_fbeta2)
F2 Score: 0.7936507936507936
plt.plot(fpr2, tpr2,'r-',label = 'SVM w/ Hyperparameter Tuning')
plt.plot([0,1],[0,1],'k-',label='random')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
AUC2=roc_auc_score(y_test, y_pred_prob)
print("AUC on test data = %f" % SVM_AUC2)
plt.grid()
plt.show()
AUC on test data = 0.970000
fig = plt.figure(figsize=(10,10))
plt.plot(['Without Hyperparameter Tuning', 'With Hyperparameter Tuning'], [SVM_fbeta1, SVM_fbeta2,],marker='x',label='F2-Score')
plt.plot(['Without Hyperparameter Tuning', 'With Hyperparameter Tuning'],[SVM_R1,SVM_R2],marker='x',label='Recall')
plt.plot(['Without Hyperparameter Tuning', 'With Hyperparameter Tuning'],[SVM_P1,SVM_P2],marker='x',label='precision')
plt.plot(['Without Hyperparameter Tuning', 'With Hyperparameter Tuning'],[SVM_AUC1,SVM_AUC2],marker='x',label='AUC')
plt.title('Support Vector Machine')
plt.legend()
plt.grid()
plt.show()
Evaluation and comparision of all the models
models = pd.DataFrame({
'Model': ['K - Nearest Neighbors' , 'Naive Bayes', 'Logistic Regression', 'Decision Tree', 'Support Vector Machines', ],
'Score': [Knn_recall2, NB_recall2, R3,DT_recall3,SVM_R2]})
models.sort_values(by='Score', ascending=False)
| Model | Score | |
|---|---|---|
| 2 | Logistic Regression | 0.944444 |
| 3 | Decision Tree | 0.888889 |
| 0 | K - Nearest Neighbors | 0.833333 |
| 4 | Support Vector Machines | 0.833333 |
| 1 | Naive Bayes | 0.722222 |